# Install packages that are not preinstalled in Colab
try:
    import google.colab
    IN_COLAB = True
except ImportError:
    IN_COLAB = False
if IN_COLAB:
    %tensorflow_version 2.x
!pip install --upgrade daft
!pip install causalgraphicalmodels
!pip install watermark
!pip install arviz
%load_ext watermark
# Core
import collections
import numpy as np
import arviz as az
import pandas as pd
import tensorflow as tf
import tensorflow_probability as tfp
import scipy.stats as stats
# visualization
import daft
import matplotlib.pyplot as plt
from causalgraphicalmodels import CausalGraphicalModel
# aliases
tfd = tfp.distributions
tfb = tfp.bijectors
Root = tfd.JointDistributionCoroutine.Root
%watermark -p numpy,tensorflow,tensorflow_probability,arviz,scipy,pandas,daft,causalgraphicalmodels
# config of various plotting libraries
%config InlineBackend.figure_format = 'retina'
az.style.use('arviz-darkgrid')
USE_XLA = False #@param
NUMBER_OF_CHAINS = 2 #@param
NUMBER_OF_BURNIN = 500 #@param
NUMBER_OF_SAMPLES = 500 #@param
NUMBER_OF_LEAPFROG_STEPS = 4 #@param
def _trace_to_arviz(trace=None,
sample_stats=None,
observed_data=None,
prior_predictive=None,
posterior_predictive=None,
inplace=True):
if trace is not None and isinstance(trace, dict):
trace = {k: v.numpy()
for k, v in trace.items()}
if sample_stats is not None and isinstance(sample_stats, dict):
sample_stats = {k: v.numpy().T for k, v in sample_stats.items()}
if prior_predictive is not None and isinstance(prior_predictive, dict):
prior_predictive = {k: v[np.newaxis]
for k, v in prior_predictive.items()}
if posterior_predictive is not None and isinstance(posterior_predictive, dict):
        if isinstance(trace, az.InferenceData) and inplace:
return trace + az.from_dict(posterior_predictive=posterior_predictive)
else:
trace = None
return az.from_dict(
posterior=trace,
sample_stats=sample_stats,
prior_predictive=prior_predictive,
posterior_predictive=posterior_predictive,
observed_data=observed_data,
)
@tf.function(autograph=False, experimental_compile=USE_XLA)
def run_hmc_chain(init_state,
bijectors,
step_size,
target_log_prob_fn,
num_leapfrog_steps=NUMBER_OF_LEAPFROG_STEPS,
num_samples=NUMBER_OF_SAMPLES,
burnin=NUMBER_OF_BURNIN,
):
def _trace_fn_transitioned(_, pkr):
return (
pkr.inner_results.inner_results.log_accept_ratio
)
hmc_kernel = tfp.mcmc.HamiltonianMonteCarlo(
target_log_prob_fn,
num_leapfrog_steps=num_leapfrog_steps,
step_size=step_size)
inner_kernel = tfp.mcmc.TransformedTransitionKernel(
inner_kernel=hmc_kernel,
bijector=bijectors)
kernel = tfp.mcmc.SimpleStepSizeAdaptation(
inner_kernel=inner_kernel,
target_accept_prob=.8,
num_adaptation_steps=int(0.8*burnin),
log_accept_prob_getter_fn=lambda pkr: pkr.inner_results.log_accept_ratio
)
results, sampler_stat = tfp.mcmc.sample_chain(
num_results=num_samples,
num_burnin_steps=burnin,
current_state=init_state,
kernel=kernel,
trace_fn=_trace_fn_transitioned)
return results, sampler_stat
def sample_posterior(jdc,
observed_data,
params,
init_state=None,
bijectors=None,
step_size = 0.1,
num_chains=NUMBER_OF_CHAINS,
num_samples=NUMBER_OF_SAMPLES,
burnin=NUMBER_OF_BURNIN):
if init_state is None:
init_state = list(jdc.sample(num_chains)[:-1])
if bijectors is None:
bijectors = [tfb.Identity() for i in init_state]
target_log_prob_fn = lambda *x: jdc.log_prob(x + observed_data)
results, sample_stats = run_hmc_chain(init_state,
bijectors,
step_size=step_size,
target_log_prob_fn=target_log_prob_fn,
num_samples=num_samples,
burnin=burnin)
stat_names = ['mean_tree_accept']
sampler_stats = dict(zip(stat_names, [sample_stats]))
transposed_results = []
for r in results:
if len(r.shape) == 2:
transposed_shape = [1,0]
elif len(r.shape) == 3:
transposed_shape = [1,0,2]
else:
transposed_shape = [1,0,2,3]
transposed_results.append(tf.transpose(r, transposed_shape))
posterior = dict(zip(params, transposed_results))
az_trace = _trace_to_arviz(trace=posterior,
sample_stats=sampler_stats)
return posterior, az_trace
# You can change the base URL to a local directory or another remote raw GitHub content URL
_BASE_URL = "https://raw.githubusercontent.com/rmcelreath/rethinking/Experimental/data"
WAFFLE_DIVORCE_DATASET_PATH = f"{_BASE_URL}/WaffleDivorce.csv"
MILK_DATASET_PATH = f"{_BASE_URL}/milk.csv"
HOWELL_DATASET_PATH = f"{_BASE_URL}/Howell1.csv"
# A utility method to convert columns of a pandas DataFrame
# into tensors with the appropriate types
def df_to_tensors(name, df, columns, default_type=tf.float32):
    """name : name of the dataset
    df : pandas DataFrame
    columns : either a list of column names that all share the same type, or
        a dictionary mapping column names to TensorFlow types (e.g. tf.float32)
    """
if isinstance(columns,dict):
column_names = columns.keys()
fields = [tf.cast(df[k].values, dtype=v) for k,v in columns.items()]
else:
column_names = columns
fields = [tf.cast(df[k].values, dtype=default_type) for k in column_names]
# build the cls
tuple_cls = collections.namedtuple(name, column_names)
# build the obj
return tuple_cls._make(fields)
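For example, a minimal usage sketch (with a small hypothetical DataFrame, not part of the chapter's data):
# hypothetical example: build a namedtuple of float32 tensors from a DataFrame
example_df = pd.DataFrame({"x": [1.0, 2.0, 3.0], "y": [4.0, 5.0, 6.0]})
example = df_to_tensors("Example", example_df, ["x", "y"])
print(example.x)  # tf.Tensor of shape (3,), dtype float32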
The chapter uses two predictor variables, marriage rate and median age at marriage, to predict the divorce rate. We start with the first predictor: median age at marriage.
Load the dataset and standardize the variables of interest (median age at marriage and divorce rate).
d = pd.read_csv(WAFFLE_DIVORCE_DATASET_PATH, sep=";")
# standardize variables
d["A"] = d.MedianAgeMarriage.pipe(lambda x: (x - x.mean()) / x.std())
d["D"] = d.Divorce.pipe(lambda x: (x - x.mean()) / x.std())
Here is the description of the linear regression model that uses median age at marriage as the predictor variable.
$D_i \sim Normal(\mu_i,\sigma)$
$\mu_i = \alpha + \beta_AA_i$
$\alpha \sim Normal(0,0.2)$
$\beta_A \sim Normal(0,0.5)$
$\sigma \sim Exponential(1)$
Since we have standardized both D and A, the intercept ($\alpha$) should be close to zero.
A slope ($\beta_A$) of 1 would then imply that a change of one standard deviation in median age at marriage is associated with a change of one standard deviation in divorce rate.
Below we compute the standard deviation of median age at marriage.
d.MedianAgeMarriage.std()
This means that when $\beta_A = 1$, a change of 1.2 years in median age at marriage is associated with a full standard deviation change in the divorce rate.
Define the model and compute the posterior
tdf = df_to_tensors('WaffleDivorce', d, ['A', 'D'])
def model_5_1(median_age_data):
def _generator():
alpha = yield Root(tfd.Sample(tfd.Normal(loc=0., scale=0.2, name="alpha"), sample_shape=1))
betaA = yield Root(tfd.Sample(tfd.Normal(loc=0., scale=0.5, name="betaA"), sample_shape=1))
sigma = yield Root(tfd.Sample(tfd.Exponential(rate=1., name="sigma"), sample_shape=1))
mu = alpha + betaA * median_age_data
divorce = yield tfd.Independent(tfd.Normal(loc=mu, scale=sigma, name="divorce"), reinterpreted_batch_ndims=1)
return tfd.JointDistributionCoroutine(_generator, validate_args=True)
jdc_5_1 = model_5_1(tdf.A)
posterior_5_1, trace_5_1 = sample_posterior(
jdc_5_1,
observed_data=(tdf.D, ),
bijectors=[tfb.Identity(), tfb.Identity(), tfb.Exp()],
params=['alpha', 'betaA', 'sigma'])
Before we do the posterior analysis, let's check our priors.
# prior predictive check at the two extremes of standardized A (median age)
A = tf.constant([-2,2], dtype=tf.float32)
jdc_5_1_prior = model_5_1(A)
prior_pred_samples = jdc_5_1_prior.sample(1000)
prior_alpha = prior_pred_samples[0]
prior_beta = prior_pred_samples[1]
prior_sigma = prior_pred_samples[2]
ds, samples = jdc_5_1_prior.sample_distributions(value=[
prior_alpha,
prior_beta,
prior_sigma,
None
])
mu = ds[-1].distribution.loc
plt.subplot(xlim=(-2, 2), ylim=(-2, 2))
for i in range(20):
plt.plot([-2, 2], mu[i], "k", alpha=0.4)
plt.xlabel("Median age marriage (std)")
plt.ylabel("Divorce rate (std)");
Now for the posterior predictions.
sample_alpha = posterior_5_1["alpha"][0]
sample_betaA = posterior_5_1["betaA"][0]
sample_sigma = posterior_5_1["sigma"][0]
A_seq = tf.linspace(start=-3., stop=3.2, num=30)
A_seq = tf.cast(A_seq, dtype=tf.float32)
# we are creating a new model for the test values
jdc_5_1_test = model_5_1(A_seq)
# we will evaluate it using the posterior samples (posterior_5_1)
ds, samples = jdc_5_1_test.sample_distributions(value=[
sample_alpha,
sample_betaA,
sample_sigma,
None
])
mu_pred = ds[-1].distribution.loc
mu_mean = tf.reduce_mean(mu_pred, 0)
mu_PI = tfp.stats.percentile(mu_pred, q=(5.5, 94.5), axis=0)
# plot it all
az.plot_pair(d[["A", "D"]].to_dict(orient="list"))
plt.fill_between(A_seq, mu_PI[0], mu_PI[1], color="k", alpha=0.2);
plt.plot(A_seq, mu_mean, "k");
The plot above shows that median age at marriage has a strong negative association with divorce rate.
We can now fit a simple regression for the marriage rate as well.
d["M"] = d.Marriage.pipe(lambda x: (x - x.mean()) / x.std())
tdf = df_to_tensors('WaffleDivorce', d, ['A', 'D', 'M'])
def model_5_2(marriage_rate_data):
def _generator():
alpha = yield Root(tfd.Sample(tfd.Normal(loc=0., scale=0.2, name="alpha"), sample_shape=1))
betaM = yield Root(tfd.Sample(tfd.Normal(loc=0., scale=0.5, name="betaM"), sample_shape=1))
sigma = yield Root(tfd.Sample(tfd.Exponential(rate=1., name="sigma"), sample_shape=1))
mu = alpha + betaM * marriage_rate_data
divorce = yield tfd.Independent(tfd.Normal(loc=mu, scale=sigma, name="divorce"), reinterpreted_batch_ndims=1)
return tfd.JointDistributionCoroutine(_generator, validate_args=True)
jdc_5_2 = model_5_2(tdf.M)
posterior_5_2, trace_5_2 = sample_posterior(
jdc_5_2,
observed_data=(tdf.D,),
params=['alpha', 'betaM', 'sigma'])
sample_alpha = posterior_5_2["alpha"][0]
sample_betaM = posterior_5_2["betaM"][0]
sample_sigma = posterior_5_2["sigma"][0]
M_seq = tf.linspace(start=-3., stop=3.2, num=30)
M_seq = tf.cast(M_seq, tf.float32)
# we are creating a new model for the test values
jdc_5_2_test = model_5_2(M_seq)
# we will evaluate it using the posterior samples (posterior_5_2)
ds, samples = jdc_5_2_test.sample_distributions(value=[
sample_alpha,
sample_betaM,
sample_sigma,
None
])
mu_pred = ds[-1].distribution.loc
mu_mean = tf.reduce_mean(mu_pred, 0)
mu_PI = tfp.stats.percentile(mu_pred, q=(5.5, 94.5), axis=0)
# plot it all
az.plot_pair(d[["M", "D"]].to_dict(orient="list"))
plt.fill_between(M_seq, mu_PI[0], mu_PI[1], color="k", alpha=0.2);
plt.plot(M_seq, mu_mean, "k");
The plot above shows that marriage rate has a strong positive association with divorce rate.
The author cautions that merely comparing parameter means between bivariate regressions is no way to decide which predictor is better: the predictors could each carry independent value, be redundant, or one could eliminate the value of the other.
How do we make sense of all this?
He explains that here we need to think causally.
A few interesting assumptions (or rather deductions):
a) Age at marriage has a direct impact on divorce rate, as people who marry younger may grow incompatible with their partner
b) Marriage rate has a direct impact on divorce rate, for the obvious reason that more marriages mean more opportunities for divorce
c) Finally, age at marriage has an impact on marriage rate: the younger people marry, the more marriages occur
Another way to represent the above is:
A -> D
M -> D
A -> M
and yet another way is to use a DAG (Directed Acyclic Graph):
dag5_1 = CausalGraphicalModel(
nodes=["A", "D", "M"],
edges=[("A", "D"),
("A", "M"),
("M", "D")])
pgm = daft.PGM()
coordinates = {"A": (0, 0), "D": (1, 1), "M": (2, 0)}
for node in dag5_1.dag.nodes:
pgm.add_node(node, node, *coordinates[node])
for edge in dag5_1.dag.edges:
pgm.add_edge(*edge)
pgm.render()
plt.gca().invert_yaxis()
The DAG above clearly shows that A influences D both directly and indirectly (via M).
The author uses the phrase "total influence": total means we must account for every path from A to D. We can enumerate those paths explicitly, as shown below.
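Here is a sketch that lists the directed paths from A to D using networkx (a dependency of causalgraphicalmodels; the `dag` attribute of a CausalGraphicalModel is a networkx DiGraph):
import networkx as nx
# list every directed path from A to D in dag5_1
for path in nx.all_simple_paths(dag5_1.dag, "A", "D"):
    print(" -> ".join(path))
# expected: the direct path A -> D and the indirect path A -> M -> D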
MEDIATION: suppose A did not influence D directly, but only through M. This type of relationship is called mediation.
The author raises many interesting questions here. He asks whether there is indeed a direct effect of marriage rate, or whether age at marriage drives both variables, creating a spurious correlation between marriage rate and divorce rate.
Checking conditional independencies
# Note: there is no explicit code section in the book for drawing the second DAG,
# but the figure appears there, so we draw it as well
dag5_2 = CausalGraphicalModel(
nodes=["A", "D", "M"],
edges=[("A", "D"),
("A", "M")])
pgm = daft.PGM()
coordinates = {"A": (0, 0), "D": (1, 1), "M": (2, 0)}
for node in dag5_2.dag.nodes:
pgm.add_node(node, node, *coordinates[node])
for edge in dag5_2.dag.edges:
pgm.add_edge(*edge)
pgm.render()
plt.gca().invert_yaxis()
DMA_dag2 = CausalGraphicalModel(
nodes=["A", "D", "M"],
edges=[("A", "D"),
("A", "M")])
all_independencies = DMA_dag2.get_all_independence_relationships()
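# print only the independencies whose conditioning set is minimal for its pair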
for s in all_independencies:
if all(t[0] != s[0] or t[1] != s[1] or not t[2].issubset(s[2])
for t in all_independencies if t != s):
print(s)
Checking conditional independencies in DAG 1
DMA_dag1 = CausalGraphicalModel(
nodes=["A", "D", "M"],
edges=[("A", "D"),
("A", "M"),
("M", "D")])
all_independencies = DMA_dag1.get_all_independence_relationships()
for s in all_independencies:
if all(t[0] != s[0] or t[1] != s[1] or not t[2].issubset(s[2])
for t in all_independencies if t != s):
print(s)
Executing the cell above should not display anything: in DAG 1, where all nodes are connected to each other, there are no conditional independencies.
def model_5_3(median_age_data, marriage_rate_data):
def _generator():
alpha = yield Root(tfd.Sample(tfd.Normal(loc=0., scale=0.2, name="alpha"), sample_shape=1))
betaA = yield Root(tfd.Sample(tfd.Normal(loc=0., scale=0.5, name="betaA"), sample_shape=1))
betaM = yield Root(tfd.Sample(tfd.Normal(loc=0., scale=0.5, name="betaM"), sample_shape=1))
sigma = yield Root(tfd.Sample(tfd.Exponential(rate=1., name="sigma"), sample_shape=1))
mu = alpha + betaA * median_age_data + betaM * marriage_rate_data
divorce = yield tfd.Independent(tfd.Normal(loc=mu, scale=sigma, name="divorce"), reinterpreted_batch_ndims=1)
return tfd.JointDistributionCoroutine(_generator, validate_args=True)
jdc_5_3 = model_5_3(tdf.A, tdf.M)
posterior_5_3, trace_5_3 = sample_posterior(
jdc_5_3,
observed_data=(tdf.D,),
params=['alpha', 'betaA', 'betaM', 'sigma'])
az.summary(trace_5_3, round_to=2, kind='stats', hdi_prob=0.89)
Let's try to understand the table generated by arviz above.
In this model we included both predictors, M and A. The coefficient of M (betaM) is now close to zero, whereas betaA is more or less unchanged.
az.plot_trace(trace_5_3);
Here we use a forest plot to compare the posteriors of the various models we have built so far.
coeftab = {"m5.1": trace_5_1,
"m5.2": trace_5_2,
"m5.3": trace_5_3}
az.plot_forest(list(coeftab.values()), model_names=list(coeftab.keys()),combined=True,
var_names=["betaA", "betaM"], hdi_prob=0.89);
Let's learn how to read the plot above.
m5.1 is our first model, which used betaA. Compared to m5.3, its estimate is more or less in the same place.
whereas,
m5.2 is our second model, which used betaM. Compared to m5.3, its estimate is now much closer to zero.
This is another way to read/plot what we saw in the arviz summary in Code 5.10.
From all this we can say: once we know the median age at marriage for a State, there is little or no additional predictive power in also knowing the rate of marriage in that State.
TODO: revisit this cell to run the full simulation exercise the author suggests, rather than just translating the code.
N = 50 # number of simulated States
age = tfd.Normal(loc=0., scale=1.).sample((N,)) # sim A
mar = tfd.Normal(loc=age, scale=1.).sample() # sim A -> M
div = tfd.Normal(loc=age, scale=1.).sample() # sim A -> D
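A quick sanity check on the simulation (not in the book's code; it just verifies that both predictors correlate with the outcome even though only age is causal):
# correlation matrix of age, mar, div: mar correlates with div
# only because age drives both
print(np.corrcoef(np.stack([age.numpy(), mar.numpy(), div.numpy()])))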
The next few cells are all about how to plot multivariate posteriors.
Previously we had only one predictor variable, so a single scatterplot could convey a lot of information, and adding regression lines and intervals helped in judging the size and uncertainty of the association. In the case of multivariate regression, however, we need other types of plots.
The author mentions that we also need to know how to select the proper type of plot for a given problem.
He explains three types of plots:
Predictor residual plots
Posterior prediction plots
Counterfactual plots
# first we define the model for the predictor residual plot; it has the same
# structure as m5_1, but regresses marriage rate (M) on median age at marriage (A)
def model_5_4(median_age_data):
def _generator():
alpha = yield Root(tfd.Sample(tfd.Normal(loc=0., scale=0.2, name="alpha"), sample_shape=1))
betaA = yield Root(tfd.Sample(tfd.Normal(loc=0., scale=0.5, name="betaA"), sample_shape=1))
sigma = yield Root(tfd.Sample(tfd.Exponential(rate=1., name="sigma"), sample_shape=1))
mu = alpha + betaA * median_age_data
M = yield tfd.Independent(tfd.Normal(loc=mu, scale=sigma, name="M"), reinterpreted_batch_ndims=1)
return tfd.JointDistributionCoroutine(_generator, validate_args=True)
jdc_5_4 = model_5_4(tdf.A)
posterior_5_4, trace_5_4 = sample_posterior(
jdc_5_4,
    observed_data=(tdf.M,),
params=['alpha', 'betaA', 'sigma'])
az.plot_trace(trace_5_4);
First: the predictor residual plot.
sample_alpha = posterior_5_4["alpha"][0]
sample_betaA = posterior_5_4["betaA"][0]
sample_sigma = posterior_5_4["sigma"][0]
ds, samples = jdc_5_4.sample_distributions(value=[
sample_alpha,
sample_betaA,
sample_sigma,
None
])
mu_pred = ds[-1].distribution.loc
mu_mean = tf.reduce_mean(mu_pred, 0)
mu_resid = d.M.values - mu_mean
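The book's predictor residual plot then shows divorce rate against these residuals. A sketch (States on the dashed line have exactly the marriage rate the model expects given their median age at marriage):
# divorce rate versus marriage-rate residuals
plt.plot(mu_resid, d.D, "o")
plt.axvline(0, ls="--", color="k", alpha=0.5)
plt.xlabel("Marriage rate residuals")
plt.ylabel("Divorce rate (std)");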
Second: posterior prediction plots.
sample_alpha = posterior_5_3["alpha"][0]
sample_betaA = posterior_5_3["betaA"][0]
sample_betaM = posterior_5_3["betaM"][0]
sample_sigma = posterior_5_3["sigma"][0]
ds, samples = jdc_5_3.sample_distributions(value=[
sample_alpha,
sample_betaA,
sample_betaM,
sample_sigma,
None
])
mu_pred = ds[-1].distribution.loc
mu_mean = tf.reduce_mean(mu_pred, 0)
mu_PI = tfp.stats.percentile(mu_pred, q=(5.5, 94.5), axis=0).numpy()
# get the simulated D
D_sim = samples[-1]
D_PI = tfp.stats.percentile(D_sim, q=(5.5, 94.5), axis=0)
D_sim.shape
Plot the predictions against the observed values.
ax = plt.subplot(ylim=(float(mu_PI.min()), float(mu_PI.max())),
xlabel="Observed divorce", ylabel="Predicted divorce")
plt.plot(d.D, mu_mean, "o")
x = np.linspace(mu_PI.min(), mu_PI.max(), 101)
plt.plot(x, x, "--")
for i in range(d.shape[0]):
plt.plot([d.D[i]] * 2, mu_PI[:, i], "b")
fig = plt.gcf()
The diagonal line shows perfect predictions.
for i in range(d.shape[0]):
if d.Loc[i] in ["ID", "UT", "RI", "ME"]:
ax.annotate(d.Loc[i], (d.D[i], mu_mean[i]), xytext=(-25, -5),
textcoords="offset pixels")
fig
Remember that the graph above is drawn using m5_3, which takes both predictor variables into account.
According to the book, the figure shows that the "model under-predicts for States with very high divorce rates while it over-predicts for States with very low divorce rates." You can see this in the plot: points for high-divorce States fall below the diagonal (predicted less than observed), while points for low-divorce States sit above it.
Some States, the labelled ones such as Idaho (ID) and Utah (UT), have lower divorce rates than predicted for religious reasons.
Overthinking: simulating spurious association. Below we simulate a case where x_real influences both y and x_spur, so that x_spur looks predictive of y even though it has no effect of its own.
N = 100 # number of cases
# x_real as Gaussian with mean 0 and stddev 1
x_real = tfd.Normal(loc=0., scale=1.).sample((N,))
# x_spur as Gaussian with mean=x_real
x_spur = tfd.Normal(loc=x_real, scale=1.).sample()
# y as Gaussian with mean=x_real
y = tfd.Normal(loc=x_real, scale=1.).sample()
# bind all together in data frame
d = pd.DataFrame({"y": y, "x_real": x_real, "x_spur": x_spur})
d
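As a quick (non-Bayesian) check, not part of the book's code, we can regress y on both predictors with ordinary least squares; once x_real is in the model, the coefficient on x_spur should be near zero:
# OLS via lstsq: columns are intercept, x_real, x_spur
X = np.column_stack([np.ones(N), d["x_real"], d["x_spur"]])
coef, *_ = np.linalg.lstsq(X, d["y"].values, rcond=None)
print(coef)  # the x_spur coefficient should be close to zero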
The third type of plot is the counterfactual plot.
The simplest use of a counterfactual plot is to see how the prediction changes as we change only one predictor at a time.
# Reloading the dataset
d = pd.read_csv(WAFFLE_DIVORCE_DATASET_PATH, sep=";")
# standardize variables
d["A"] = d.MedianAgeMarriage.pipe(lambda x: (x - x.mean()) / x.std())
d["D"] = d.Divorce.pipe(lambda x: (x - x.mean()) / x.std())
d["M"] = d.Marriage.pipe(lambda x: (x - x.mean()) / x.std())
tdf = df_to_tensors('WaffleDivorce', d, ['A', 'D', 'M'])
def model_5_3_A(median_age_data, marriage_rate_data):
def _generator():
alpha = yield Root(tfd.Sample(tfd.Normal(loc=0., scale=0.2, name="alpha"), sample_shape=1))
betaA = yield Root(tfd.Sample(tfd.Normal(loc=0., scale=0.5, name="betaA"), sample_shape=1))
betaM = yield Root(tfd.Sample(tfd.Normal(loc=0., scale=0.5, name="betaM"), sample_shape=1))
sigma = yield Root(tfd.Sample(tfd.Exponential(rate=1., name="sigma"), sample_shape=1))
mu = alpha + betaA * median_age_data + betaM * marriage_rate_data
divorce = yield tfd.Independent(tfd.Normal(loc=mu, scale=sigma, name="divorce"), reinterpreted_batch_ndims=1)
return tfd.JointDistributionCoroutine(_generator, validate_args=True)
def model_5_X_M(median_age_data):
def _generator():
alphaM = yield Root(tfd.Sample(tfd.Normal(loc=0., scale=0.2, name="alphaM"), sample_shape=1))
betaAM = yield Root(tfd.Sample(tfd.Normal(loc=0., scale=0.5, name="betaAM"), sample_shape=1))
sigmaM = yield Root(tfd.Sample(tfd.Exponential(rate=1., name="sigmaM"), sample_shape=1))
mu_M = alphaM + betaAM * median_age_data
M = yield tfd.Independent(tfd.Normal(loc=mu_M, scale=sigmaM, name="M"), reinterpreted_batch_ndims=1)
return tfd.JointDistributionCoroutine(_generator, validate_args=True)
## First model
jdc_5_3_A = model_5_3_A(tdf.A, tdf.M)
posterior_5_3_A, trace_5_3_A = sample_posterior(
jdc_5_3_A,
observed_data=(tdf.D,),
params=['alpha', 'betaA', 'betaM', 'sigma'])
## Second model
jdc_5_X_M = model_5_X_M(tdf.A)
posterior_5_X_M, trace_5_X_M = sample_posterior(
jdc_5_X_M,
observed_data=(tdf.M,),
params=['alpha', 'betaA', 'sigma'])
az.plot_trace(trace_5_X_M);
az.plot_trace(trace_5_3_A);
A_seq = tf.linspace(-2.0, 2.0, num=30)
A_seq = tf.cast(A_seq, dtype=tf.float32)
sample5X_alpha = posterior_5_X_M["alpha"][0]
sample5X_betaA = posterior_5_X_M["betaA"][0]
sample5X_sigma = posterior_5_X_M["sigma"][0]
# simulate M using A_seq
jdc_5_X_M_test = model_5_X_M(A_seq)
samples = jdc_5_X_M_test.sample(value=[
sample5X_alpha,
sample5X_betaA,
sample5X_sigma,
None
])
M_sim = samples[-1]
M_sim.shape
# time to simulate D using M
sample53A_alpha = posterior_5_3_A["alpha"][0]
sample53A_betaA = posterior_5_3_A["betaA"][0]
sample53A_betaM = posterior_5_3_A["betaM"][0]
sample53A_sigma = posterior_5_3_A["sigma"][0]
jdc_5_3_A_test = model_5_3_A(A_seq, M_sim)
_, _, _, _, D_sim = jdc_5_3_A_test.sample(value=[
sample53A_alpha,
sample53A_betaA,
sample53A_betaM,
sample53A_sigma,
None
])
D_sim.shape
# display counterfactual predictions
plt.plot(A_seq, tf.reduce_mean(D_sim, 0))
plt.gca().set(ylim=(-2, 2), xlabel="manipulated A", ylabel="counterfactual D")
plt.fill_between(A_seq, *tfp.stats.percentile(D_sim, q=(5.5, 94.5), axis=0),
color="k", alpha=0.2)
plt.title("Total counterfactual effect of A on D");
M_seq = tf.linspace(-2.0, 2.0, num=30)
M_seq = tf.cast(M_seq, dtype=tf.float32)
def compute_D_from_posterior_using_M(m):
    # A is held at 0 (its mean on the standardized scale),
    # so mu reduces to alpha + betaM * m
    mu = sample53A_alpha + sample53A_betaM * m
    return tfd.Normal(loc=mu, scale=sample53A_sigma).sample()
D_sim = tf.transpose(tf.squeeze(tf.vectorized_map(compute_D_from_posterior_using_M, M_seq)))
D_sim.shape
# display counterfactual predictions
plt.plot(M_seq, tf.reduce_mean(D_sim, 0))
plt.gca().set(ylim=(-2, 2), xlabel="manipulated M", ylabel="counterfactual D")
plt.fill_between(M_seq, *tfp.stats.percentile(D_sim, q=(5.5, 94.5), axis=0),
color="k", alpha=0.2)
plt.title("Total counterfactual effect of M on D");
In the overthinking box, the author explains how to run these simulations manually.
Because TensorFlow Probability lacks a built-in predictive simulation helper (like rethinking's sim), we had to do it manually anyway; refer to the cells above.
The previous example used multiple predictors to showcase spurious association, but sometimes you really do need multiple predictor variables, because each has an influence on the outcome.
It is possible for one predictor to be positively related to the outcome while the other is negatively related.
We are going to use a new dataset, milk.
The question we want to answer: to what extent is the energy content of milk, measured in kcal, related to the percent of brain mass that is neocortex?
d = pd.read_csv(MILK_DATASET_PATH, sep=";")
d.info()
d.head()
Although not mentioned in the question above, we also consider the adult female body mass. This lets us see the masking that hides the relationships among the variables.
d["K"] = d["kcal.per.g"].pipe(lambda x: (x - x.mean()) / x.std())
d["N"] = d["neocortex.perc"].pipe(lambda x: (x - x.mean()) / x.std())
d["M"] = d.mass.pipe(np.log).pipe(lambda x: (x - x.mean()) / x.std())
tdf = df_to_tensors('Milk', d, ['K', 'N', 'M'])
A simple bivariate model between kcal and neocortex percent
# N => Standardized Neocortex percent
def model_5_5(neocortex_percent_std):
def _generator():
alpha = yield Root(tfd.Sample(tfd.Normal(loc=0., scale=1., name="alpha"), sample_shape=1))
betaN = yield Root(tfd.Sample(tfd.Normal(loc=0., scale=1., name="betaN"), sample_shape=1))
sigma = yield Root(tfd.Sample(tfd.Exponential(rate=1., name="sigma"), sample_shape=1))
mu = alpha + betaN * neocortex_percent_std
K = yield tfd.Independent(tfd.Normal(loc=mu, scale=sigma, name="K"), reinterpreted_batch_ndims=1)
return tfd.JointDistributionCoroutine(_generator, validate_args=True)
jdc_5_5 = model_5_5(tdf.N)
samples = jdc_5_5.sample()
samples
If you look at the samples displayed above, you will see a lot of NaNs. TensorFlow Probability does not complain or throw an exception here, but the NaNs will cause a lot of pain later.
In other words, we must not have NaNs in the data.
d["neocortex.perc"]
We have a lot of missing data in the neocortex.perc column.
Drop all the cases with missing values. This is known as COMPLETE CASE ANALYSIS.
dcc = d.iloc[d[["K", "N", "M"]].dropna(how="any", axis=0).index]
# we now have only 17 rows
dcc.describe()
tdcc = df_to_tensors('Milk', dcc, ['K', 'N', 'M'])
jdc_5_5_draft = model_5_5(tdcc.N)
samples = jdc_5_5_draft.sample()
samples
You should not see any NaNs in the samples.
Investigate whether the priors are sensible by plotting 50 prior regression lines.
xseq = tf.constant([-2.0,2.0], dtype=tf.float32)
sample_alpha, sample_betaN, sample_sigma, _ = model_5_5(xseq).sample(1000)
# we simply compute mu at the two extreme values of N
mu_pred_0 = sample_alpha + sample_betaN * -2.0
mu_pred_1 = sample_alpha + sample_betaN * 2.0
mu_pred = tf.concat([mu_pred_0, mu_pred_1], axis=1)
plt.subplot(xlim=xseq.numpy(), ylim=xseq.numpy())
for i in range(50):
plt.plot(xseq, mu_pred[i], "k", alpha=0.3)
The lines above do not look good: many imply implausibly strong relationships. We should use tighter priors, keeping $\alpha$ close to zero and making the slope prior more modest.
def model_5_5(neocortex_percent_std):
def _generator():
alpha = yield Root(tfd.Sample(tfd.Normal(loc=0., scale=0.2, name="alpha"), sample_shape=1))
betaN = yield Root(tfd.Sample(tfd.Normal(loc=0., scale=0.5, name="betaN"), sample_shape=1))
sigma = yield Root(tfd.Sample(tfd.Exponential(rate=1., name="sigma"), sample_shape=1))
mu = alpha + betaN * neocortex_percent_std
K = yield tfd.Independent(tfd.Normal(loc=mu, scale=sigma, name="K"), reinterpreted_batch_ndims=1)
return tfd.JointDistributionCoroutine(_generator, validate_args=True)
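With the tightened priors we can repeat the earlier prior predictive check (a sketch mirroring the cell above; the regression lines should now stay within the plausible range of the standardized data):
xseq = tf.constant([-2.0, 2.0], dtype=tf.float32)
sample_alpha, sample_betaN, sample_sigma, _ = model_5_5(xseq).sample(1000)
# mu at the two extreme values of standardized N
mu_pred = tf.concat([sample_alpha + sample_betaN * -2.0,
                     sample_alpha + sample_betaN * 2.0], axis=1)
plt.subplot(xlim=xseq.numpy(), ylim=xseq.numpy())
for i in range(50):
    plt.plot(xseq, mu_pred[i], "k", alpha=0.3)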
jdc_5_5 = model_5_5(tdcc.N)
posterior_5_5, trace_5_5 = sample_posterior(
jdc_5_5,
observed_data=(tdcc.K,),
params=['alpha', 'betaN', 'sigma'])
az.summary(trace_5_5, round_to=2, kind='stats', hdi_prob=0.89)
xseq = tf.linspace(start=dcc.N.min() - 0.15, stop=dcc.N.max() + 0.15, num=30)
xseq = tf.cast(xseq, dtype=tf.float32)
sample_alpha = posterior_5_5["alpha"][0]
sample_betaN = posterior_5_5["betaN"][0]
sample_sigma = posterior_5_5["sigma"][0]
jdc_5_5_test = model_5_5(xseq)
ds, samples = jdc_5_5_test.sample_distributions(value=[
sample_alpha,
sample_betaN,
sample_sigma,
None
])
mu_pred = ds[-1].distribution.loc
# summarize samples across cases
mu_mean = tf.reduce_mean(mu_pred, 0)
mu_PI = tfp.stats.percentile(mu_pred, q=(5.5, 94.5), axis=0)
az.plot_pair(dcc[["N", "K"]].to_dict(orient="list"))
plt.plot(xseq, mu_mean, "k")
plt.fill_between(xseq, mu_PI[0], mu_PI[1], color="k", alpha=0.2);
The plot above displays a weak association when using a simple bivariate regression.
We now consider another predictor variable: the (log) adult female body mass.
def model_5_6(female_body_mass):
def _generator():
alpha = yield Root(tfd.Sample(tfd.Normal(loc=0., scale=0.2, name="alpha"), sample_shape=1))
betaM = yield Root(tfd.Sample(tfd.Normal(loc=0., scale=0.5, name="betaM"), sample_shape=1))
sigma = yield Root(tfd.Sample(tfd.Exponential(rate=1., name="sigma"), sample_shape=1))
mu = alpha + betaM * female_body_mass
K = yield tfd.Independent(tfd.Normal(loc=mu, scale=sigma, name="K"), reinterpreted_batch_ndims=1)
return tfd.JointDistributionCoroutine(_generator, validate_args=True)
jdc_5_6 = model_5_6(tdcc.M)
posterior_5_6, trace_5_6 = sample_posterior(
jdc_5_6,
observed_data=(tdcc.K,),
params=['alpha', 'betaM', 'sigma'])
az.summary(trace_5_6, round_to=2, kind='stats', hdi_prob=0.89)
Log body mass is negatively associated with kcal. The association is stronger than that of neocortex percent, but in the opposite direction.
Now let's see what happens when we add both the predictor variables to the model
def model_5_7(neocortex_percent_std, female_body_mass):
def _generator():
alpha = yield Root(tfd.Sample(tfd.Normal(loc=0., scale=0.2, name="alpha"), sample_shape=1))
betaN = yield Root(tfd.Sample(tfd.Normal(loc=0., scale=0.5, name="betaN"), sample_shape=1))
betaM = yield Root(tfd.Sample(tfd.Normal(loc=0., scale=0.5, name="betaM"), sample_shape=1))
sigma = yield Root(tfd.Sample(tfd.Exponential(rate=1., name="sigma"), sample_shape=1))
mu = alpha + betaN * neocortex_percent_std + betaM * female_body_mass
K = yield tfd.Independent(tfd.Normal(loc=mu, scale=sigma, name="K"), reinterpreted_batch_ndims=1)
return tfd.JointDistributionCoroutine(_generator, validate_args=True)
jdc_5_7 = model_5_7(tdcc.N, tdcc.M)
posterior_5_7, trace_5_7 = sample_posterior(jdc_5_7,
observed_data=(tdcc.K,),
params=['alpha', 'betaN', 'betaM', 'sigma'])
az.summary(trace_5_7, round_to=2, kind='stats', hdi_prob=0.89)
By including both predictor variables in the regression, the posterior association of each with the outcome has increased.
coeftab = {"m5.5": trace_5_5,
"m5.6": trace_5_6,
"m5.7": trace_5_7}
az.plot_forest(list(coeftab.values()), model_names=list(coeftab.keys()),combined=True,
var_names=["betaM", "betaN"], hdi_prob=0.89);
Counterfactual plot when setting N to zero and using M as the predictor variable
xseq = tf.linspace(start=dcc.M.min() - 0.15, stop=dcc.M.max() + 0.15, num=30)
xseq = tf.cast(xseq, dtype=tf.float32)
sample_alpha = posterior_5_7["alpha"][0]
sample_betaN = posterior_5_7["betaN"][0]
sample_betaM = posterior_5_7["betaM"][0]
sample_sigma = posterior_5_7["sigma"][0]
# with N fixed at 0 (its mean), mu reduces to alpha + betaM * M
mu_pred = tf.transpose(tf.squeeze(tf.vectorized_map(lambda x: sample_alpha + sample_betaM * x, xseq)))
# summarize samples across cases
mu_mean = tf.reduce_mean(mu_pred, 0)
mu_PI = tfp.stats.percentile(mu_pred, q=(5.5, 94.5), axis=0)
plt.subplot(xlim=(dcc.M.min(), dcc.M.max()), ylim=(dcc.K.min(), dcc.K.max()))
plt.plot(xseq, mu_mean, "k")
plt.fill_between(xseq, mu_PI[0], mu_PI[1], color="k", alpha=0.2);
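The book draws the symmetric counterfactual as well: holding M at zero and varying N. A sketch mirroring the cell above:
xseq_N = tf.linspace(start=dcc.N.min() - 0.15, stop=dcc.N.max() + 0.15, num=30)
xseq_N = tf.cast(xseq_N, dtype=tf.float32)
# with M fixed at 0 (its mean), mu reduces to alpha + betaN * N
mu_pred = tf.transpose(tf.squeeze(tf.vectorized_map(lambda x: sample_alpha + sample_betaN * x, xseq_N)))
mu_mean = tf.reduce_mean(mu_pred, 0)
mu_PI = tfp.stats.percentile(mu_pred, q=(5.5, 94.5), axis=0)
plt.subplot(xlim=(dcc.N.min(), dcc.N.max()), ylim=(dcc.K.min(), dcc.K.max()))
plt.plot(xseq_N, mu_mean, "k")
plt.fill_between(xseq_N, mu_PI[0], mu_PI[1], color="k", alpha=0.2);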
# M -> K <- N
# M -> N
n = 100
M = tfd.Normal(loc=0., scale=1.).sample((n,))
N = tfd.Normal(loc=M, scale=1.).sample()
K = tfd.Normal(loc=N - M, scale=1.).sample()
d_sim = pd.DataFrame({"K": K, "N": N, "M": M})
d_sim
# M -> K <- N
# N -> M
n = 100
N = tfd.Normal(loc=0., scale=1.).sample((n,))
M = tfd.Normal(loc=N, scale=1.).sample()
K = tfd.Normal(loc=N - M, scale=1.).sample()
d_sim2 = pd.DataFrame({"K": K, "N": N, "M": M})
# M -> K <- N
# M <- U -> N
n = 100
# U is an unobserved common cause of both M and N
U = tfd.Normal(loc=0., scale=1.).sample((n,))
N = tfd.Normal(loc=U, scale=1.).sample()
M = tfd.Normal(loc=U, scale=1.).sample()
K = tfd.Normal(loc=N - M, scale=1.).sample()
d_sim3 = pd.DataFrame({"K": K, "N": N, "M": M})
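Each of these simulated data frames can be pushed through the models above to verify the masking pattern; for example (a sketch reusing the helpers defined earlier, on the first simulated dataset):
# fit the two-predictor model to the simulated data
tdf_sim = df_to_tensors('MilkSim', d_sim, ['K', 'N', 'M'])
jdc_sim = model_5_7(tdf_sim.N, tdf_sim.M)
_, trace_sim = sample_posterior(jdc_sim,
                                observed_data=(tdf_sim.K,),
                                params=['alpha', 'betaN', 'betaM', 'sigma'])
az.summary(trace_sim, round_to=2, kind='stats', hdi_prob=0.89)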
The book uses dagitty, an R package that can draw a DAG as well as generate the equivalent DAGs for a given DAG.
At this point I do not have a corresponding function in Python, so I specify the equivalent DAGs manually (after checking the result in RStudio, since the book does not display the equivalent DAGs either).
coordinates = {"M": (0, 0.5), "K": (1, 1), "N": (2, 0.5)}
node_names = ["M", "K", "N"]
equivalent_dags = [
[("M", "K"), ("N", "K"), ("M", "N")],
[("M", "K"), ("K", "N"), ("M", "N")],
[("M", "K"), ("N", "K"), ("N", "M")],
[("K", "M"), ("K", "N"), ("M", "N")],
[("K", "M"), ("N", "K"), ("N", "M")],
[("K", "M"), ("K", "N"), ("N", "M")]
]
for e in equivalent_dags:
dag = CausalGraphicalModel(nodes=node_names, edges=e)
pgm = daft.PGM()
for node in dag.dag.nodes:
pgm.add_node(node, node, *coordinates[node])
for edge in dag.dag.edges:
pgm.add_edge(*edge)
pgm.render()
plt.gca().invert_yaxis()
This section discusses categorical variables, i.e. discrete, unordered variables.
We will again use the Howell dataset. In the previous chapter we took the sex variable into consideration; here we will model it as a categorical variable.
To start, let's load the dataset.
d = pd.read_csv(HOWELL_DATASET_PATH, sep=";")
d.info()
d.head()
Here we consider the following model, which uses an indicator variable $m_i$:
$h_i \sim Normal(\mu_i,\sigma)$
$\mu_i = \alpha + \beta_m m_i$
$\alpha \sim Normal(178,20)$
$\beta_m \sim Normal(0,10)$
$\sigma \sim Exponential(1)$
Here $m_i$ is an indicator that is 1 when case $i$ is male. This means $\alpha$ is used in predicting both female and male heights.
But a male gets an extra $\beta_m$ on top. It also means that $\alpha$ does not represent the average height of all samples, but rather the average female height, and the male prediction involves two parameters.
mu_female = tfd.Normal(loc=178., scale=20.).sample((int(1e4),))
diff = tfd.Normal(loc=0., scale=10.).sample((int(1e4),))
mu_male = tfd.Normal(loc=178.0, scale=20.).sample((int(1e4),)) + diff
sample_dict = {
'mu_female' : mu_female.numpy(),
'mu_male' : mu_male.numpy()
}
az.summary(az.from_dict(sample_dict), round_to=2, kind='stats', hdi_prob=0.89)
The prior for males is wider (see the std column). This is because it uses both parameters.
The author explains that this makes assigning sensible priors harder: as shown above, one of the categories has more uncertainty in its prediction, because two priors are involved.
Another approach is to use an INDEX VARIABLE.
d["sex"] = np.where(d.male.values == 1, 1, 0)
d.sex
tdf = df_to_tensors('Howell1', d, {
'height' : tf.float32,
'sex' : tf.int32
})
def model_5_8(sex):
def _generator():
alpha = yield Root(tfd.Sample(tfd.Normal(loc=178., scale=20., name="alpha"), sample_shape=(2,)))
sigma = yield Root(tfd.Sample(tfd.Uniform(low=0., high=50., name="sigma"), sample_shape=1))
        # this took a long time to figure out: tf.gather selects alpha[sex_i] for each
        # observation, and the transposes move the chain/sample batch dimension aside
        # so that gather indexes the category axis of alpha
        mu = tf.transpose(tf.gather(tf.transpose(alpha), sex))
height = yield tfd.Independent(tfd.Normal(loc=mu, scale=sigma, name="height"), reinterpreted_batch_ndims=1)
return tfd.JointDistributionCoroutine(_generator, validate_args=True)
jdc_5_8 = model_5_8(tdf.sex)
posterior_5_8, trace_5_8 = sample_posterior(
jdc_5_8,
observed_data=(tdf.height,),
params=['alpha', 'sigma'])
# alpha is a vector of length 2, so for arviz to display it
# properly we split it into named per-sex components
female_alpha, male_alpha = tf.split(posterior_5_8["alpha"][0], 2, axis=1)
# reformatted
posterior_5_8_new = {
"female_alpha" : female_alpha.numpy(),
"male_alpha" : male_alpha.numpy(),
"sigma" : posterior_5_8["sigma"][0]
}
az.summary(az.from_dict(posterior_5_8_new), round_to=2, kind='stats', hdi_prob=0.89)
What is the expected difference between females and males?
We can use the samples from the posterior to compute this.
posterior_5_8_new.update({
"diff_fm" : female_alpha.numpy() - male_alpha.numpy()
})
az.summary(az.from_dict(posterior_5_8_new), round_to=2, kind='stats', hdi_prob=0.89)
Note that the above computation (diff_fm) now appears in the arviz summary. This kind of calculation is called a CONTRAST.
We explore the milk dataset again. We are now interested in the clade variable, which encodes the broad taxonomic membership of each species.
d = pd.read_csv(MILK_DATASET_PATH, sep=";")
d.clade.unique()
d["clade_id"] = d.clade.astype("category").cat.codes
d["clade_id"]
d.clade
Based on the above, it seems pandas first sorts the categories by name and then assigns the codes, giving Ape = 0, New World Monkey = 1, Old World Monkey = 2 and Strepsirrhine = 3.
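A quick check of that mapping (category codes follow the sorted order of the category names):
# code -> clade name, in sorted (alphabetical) order
print(dict(enumerate(d.clade.astype("category").cat.categories)))
# expected: {0: 'Ape', 1: 'New World Monkey', 2: 'Old World Monkey', 3: 'Strepsirrhine'}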
We will now model the data again, using clade as an index variable.
d["K"] = d["kcal.per.g"].pipe(lambda x: (x - x.mean()) / x.std())
tdf = df_to_tensors('Milk', d, {
'K' : tf.float32,
'clade_id' : tf.int64
})
CLADE_ID_LEN = len(set(d.clade_id.values))
def model_5_9(clade_id):
def _generator():
alpha = yield Root(tfd.Sample(tfd.Normal(loc=0., scale=.5, name="alpha"), sample_shape=(CLADE_ID_LEN,)))
sigma = yield Root(tfd.Sample(tfd.Exponential(rate=1., name="sigma"), sample_shape=1))
mu = tf.transpose(tf.gather(tf.transpose(alpha), tf.cast(clade_id, dtype=tf.int64)))
K = yield tfd.Independent(tfd.Normal(loc=mu, scale=sigma, name="K"), reinterpreted_batch_ndims=1)
return tfd.JointDistributionCoroutine(_generator, validate_args=True)
jdc_5_9 = model_5_9(tdf.clade_id)
posterior_5_9, trace_5_9 = sample_posterior(
jdc_5_9,
observed_data=(tdf.K,),
params=['alpha', 'sigma'])
# alpha is a vector of length 4, so for arviz to display it
# properly we split it into named per-clade components
ape_alpha, nwm_alpha, owm_alpha, strep_alpha = tf.split(posterior_5_9["alpha"][0], 4, axis=1)
# reformatted
updated_posterior_5_9 = {
"ape_alpha" : ape_alpha.numpy(),
"nwm_alpha" : nwm_alpha.numpy(),
"owm_alpha" : owm_alpha.numpy(),
"strep_alpha" : strep_alpha.numpy(),
"sigma" : posterior_5_9["sigma"][0]
}
az.summary(az.from_dict(updated_posterior_5_9), round_to=2, kind='stats', hdi_prob=0.89)
labels = ["a[" + str(i) + "]:" + s
for i, s in enumerate(sorted(d.clade.unique()))]
az.plot_forest({"a" : trace_5_9.posterior["alpha"].values[0][None, ...]}, hdi_prob=0.89)
plt.gca().set(yticklabels=labels[::-1], xlabel="expected kcal (std)");
Finally, the book looks at what happens with two categorical variables in the same model, assigning each species at random to one of four fictional "houses".
np.random.seed(63)
d["house"] = np.random.choice(np.repeat(np.arange(4), 8), d.shape[0], False)
d.house
tdf = df_to_tensors('Milk', d, {
'K' : tf.float32,
'clade_id' : tf.int32,
'house' : tf.int32
})
CLADE_ID_LEN = len(set(d.clade_id.values))
HOUSE_ID_LEN = len(set(d.house.values))
def model_5_10(clade_id, house_id):
def _generator():
alpha = yield Root(tfd.Sample(tfd.Normal(loc=0., scale=.5, name="alpha"), sample_shape=(CLADE_ID_LEN,)))
house = yield Root(tfd.Sample(tfd.Normal(loc=0., scale=.5, name="house"), sample_shape=(HOUSE_ID_LEN,)))
sigma = yield Root(tfd.Sample(tfd.Exponential(rate=1., name="sigma"), sample_shape=1))
mu = tf.transpose(tf.gather(tf.transpose(alpha), tf.cast(clade_id, dtype=tf.int64))) + tf.transpose(tf.gather(tf.transpose(house), house_id))
K = yield tfd.Independent(tfd.Normal(loc=mu, scale=sigma, name="K"), reinterpreted_batch_ndims=1)
return tfd.JointDistributionCoroutine(_generator, validate_args=True)
jdc_5_10 = model_5_10(tdf.clade_id, tdf.house)
posterior_5_10, trace_5_10 = sample_posterior(
jdc_5_10,
observed_data=(tdf.K,),
params=['alpha', 'house', 'sigma'])
# alpha and house are vectors, so for arviz to display them
# properly we split them into named per-category components
ape_alpha, nwm_alpha, owm_alpha, strep_alpha = tf.split(posterior_5_10["alpha"][0], 4, axis=1)
house_0, house_1, house_2, house_3 = tf.split(posterior_5_10["house"][0], 4, axis=1)
# reformatted
updated_posterior_5_10 = {
"ape_alpha" : ape_alpha.numpy(),
"nwm_alpha" : nwm_alpha.numpy(),
"owm_alpha" : owm_alpha.numpy(),
"strep_alpha" : strep_alpha.numpy(),
"house_0" : house_0.numpy(),
"house_1" : house_1.numpy(),
"house_2" : house_2.numpy(),
"house_3" : house_3.numpy(),
"sigma" : posterior_5_10["sigma"][0]
}
az.summary(az.from_dict(updated_posterior_5_10), round_to=2, kind='stats', hdi_prob=0.89)